Loading Packages

knitr::opts_chunk$set(echo = TRUE)
#load required packages
library(tidyverse)
library(dplyr)
library(ggplot2)
library(Hmisc)
library(knitr)
library(kableExtra)

Basic Data Description

For the purpose of this project, a dataset that consists of credit card customer information has been retrieved from Kaggle.

df <- read.csv("BankChurners.csv", stringsAsFactors = F) #read data
length(df) # number of observations in original dataset
## [1] 23
#remove columns that contain "Naive"
df <- df %>%
  select(-contains('Naive'))

length(df) #number of observations in new dataset
## [1] 21
names(df) #check column names
##  [1] "CLIENTNUM"                "Attrition_Flag"          
##  [3] "Customer_Age"             "Gender"                  
##  [5] "Dependent_count"          "Education_Level"         
##  [7] "Marital_Status"           "Income_Category"         
##  [9] "Card_Category"            "Months_on_book"          
## [11] "Total_Relationship_Count" "Months_Inactive_12_mon"  
## [13] "Contacts_Count_12_mon"    "Credit_Limit"            
## [15] "Total_Revolving_Bal"      "Avg_Open_To_Buy"         
## [17] "Total_Amt_Chng_Q4_Q1"     "Total_Trans_Amt"         
## [19] "Total_Trans_Ct"           "Total_Ct_Chng_Q4_Q1"     
## [21] "Avg_Utilization_Ratio"
summary(df) # summary statistics
##    CLIENTNUM         Attrition_Flag      Customer_Age      Gender         
##  Min.   :708082083   Length:10127       Min.   :26.00   Length:10127      
##  1st Qu.:713036770   Class :character   1st Qu.:41.00   Class :character  
##  Median :717926358   Mode  :character   Median :46.00   Mode  :character  
##  Mean   :739177606                      Mean   :46.33                     
##  3rd Qu.:773143533                      3rd Qu.:52.00                     
##  Max.   :828343083                      Max.   :73.00                     
##  Dependent_count Education_Level    Marital_Status     Income_Category   
##  Min.   :0.000   Length:10127       Length:10127       Length:10127      
##  1st Qu.:1.000   Class :character   Class :character   Class :character  
##  Median :2.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2.346                                                           
##  3rd Qu.:3.000                                                           
##  Max.   :5.000                                                           
##  Card_Category      Months_on_book  Total_Relationship_Count
##  Length:10127       Min.   :13.00   Min.   :1.000           
##  Class :character   1st Qu.:31.00   1st Qu.:3.000           
##  Mode  :character   Median :36.00   Median :4.000           
##                     Mean   :35.93   Mean   :3.813           
##                     3rd Qu.:40.00   3rd Qu.:5.000           
##                     Max.   :56.00   Max.   :6.000           
##  Months_Inactive_12_mon Contacts_Count_12_mon  Credit_Limit  
##  Min.   :0.000          Min.   :0.000         Min.   : 1438  
##  1st Qu.:2.000          1st Qu.:2.000         1st Qu.: 2555  
##  Median :2.000          Median :2.000         Median : 4549  
##  Mean   :2.341          Mean   :2.455         Mean   : 8632  
##  3rd Qu.:3.000          3rd Qu.:3.000         3rd Qu.:11068  
##  Max.   :6.000          Max.   :6.000         Max.   :34516  
##  Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt
##  Min.   :   0        Min.   :    3   Min.   :0.0000       Min.   :  510  
##  1st Qu.: 359        1st Qu.: 1324   1st Qu.:0.6310       1st Qu.: 2156  
##  Median :1276        Median : 3474   Median :0.7360       Median : 3899  
##  Mean   :1163        Mean   : 7469   Mean   :0.7599       Mean   : 4404  
##  3rd Qu.:1784        3rd Qu.: 9859   3rd Qu.:0.8590       3rd Qu.: 4741  
##  Max.   :2517        Max.   :34516   Max.   :3.3970       Max.   :18484  
##  Total_Trans_Ct   Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
##  Min.   : 10.00   Min.   :0.0000      Min.   :0.0000       
##  1st Qu.: 45.00   1st Qu.:0.5820      1st Qu.:0.0230       
##  Median : 67.00   Median :0.7020      Median :0.1760       
##  Mean   : 64.86   Mean   :0.7122      Mean   :0.2749       
##  3rd Qu.: 81.00   3rd Qu.:0.8180      3rd Qu.:0.5030       
##  Max.   :139.00   Max.   :3.7140      Max.   :0.9990
#check the number of unique client number
length(unique(df[,"CLIENTNUM"])) # total rows are 10127, so there is no duplicates in client number
## [1] 10127
#check for the number of NA, 0, missing values for each column
sapply(df, function(x) sum(is.na(x) | x == 0 | x == ""))
##                CLIENTNUM           Attrition_Flag             Customer_Age 
##                        0                        0                        0 
##                   Gender          Dependent_count          Education_Level 
##                        0                      904                        0 
##           Marital_Status          Income_Category            Card_Category 
##                        0                        0                        0 
##           Months_on_book Total_Relationship_Count   Months_Inactive_12_mon 
##                        0                        0                       29 
##    Contacts_Count_12_mon             Credit_Limit      Total_Revolving_Bal 
##                      399                        0                     2470 
##          Avg_Open_To_Buy     Total_Amt_Chng_Q4_Q1          Total_Trans_Amt 
##                        0                        5                        0 
##           Total_Trans_Ct      Total_Ct_Chng_Q4_Q1    Avg_Utilization_Ratio 
##                        0                        7                     2470

The original dataset had 10127 rows and 23 columns. As suggested from the author of dataset, the last two columns containing the word “Naive” has been removed from the dataset, resulting a dataset with 10127 rows and 21 columns. There was no duplicated values in ‘CLIENTNUM’ column. A considerable amount of NA values has been found in a number of columns, and they will be pre-processed as we go along.

Outlier Removal

This section will focus on removing outliers from the dataset to produce statistically significant results. Scatter plot for each column will be created to identify existing outliers.

cols <- colnames(df) # column names
cols <- cols[-1] # exclude 'CLIENTNUM' column from outlier removal process
par(mfrow=c(4,5))
for (i in cols){
  plot <- ggplot(df, aes_string(x = df$CLIENTNUM, y = i)) + geom_point()
  print(plot)
}

Looking at the scatter plot of each column, I could observe some outlier data points on a few columns - ‘Customer_Age’, ‘Tota_Amt_Chng_Q4_Q1’, ‘Total_Ct_Chng_Q4_Q1’.

# identify and remove outliers using box plots
df1 <- df[!(df$Customer_Age %in% boxplot(df$Customer_Age)$out),]

df1 <- df[!(df$Total_Amt_Chng_Q4_Q1 %in% boxplot(df$Total_Amt_Chng_Q4_Q1)$out),]

df1 <- df[!(df$Total_Ct_Chng_Q4_Q1 %in% boxplot(df$Total_Ct_Chng_Q4_Q1)$out),]

cols <- colnames(df1) # column names
cols <- cols[-1] # exclude 'CLIENTNUM' column from outlier removal process
par(mfrow=c(4,5))
for (i in cols){
    plot <- ggplot(df1, aes_string(x = df1$CLIENTNUM, y = i)) +
    geom_point()
    print(plot)
}

#par(mfrow=c(2,7))
#attach(df1)
#scatter1 <- plot(CLIENTNUM, Customer_Age)
#scatter2 <- plot(CLIENTNUM, Dependent_count)
#scatter3 <- plot(CLIENTNUM, Months_on_book)
#scatter4 <- plot(CLIENTNUM, Total_Relationship_Count)
#scatter5 <- plot(CLIENTNUM, Months_Inactive_12_mon)
#scatter6 <- plot(CLIENTNUM, Contacts_Count_12_mon)
#scatter7 <- plot(CLIENTNUM, Credit_Limit)
#scatter8 <- plot(CLIENTNUM, Total_Revolving_Bal)
#scatter9 <- plot(CLIENTNUM, Avg_Open_To_Buy)
#scatter10 <- plot(CLIENTNUM, Total_Amt_Chng_Q4_Q1)
#scatter11 <- plot(CLIENTNUM, Total_Trans_Amt)
#scatter12 <- plot(CLIENTNUM, Total_Trans_Ct)
#scatter13 <- plot(CLIENTNUM, Total_Ct_Chng_Q4_Q1)
#scatter14 <- plot(CLIENTNUM, Avg_Utilization_Ratio)

Binning

Some of the columns in the dataset consists of continuous variables, and that will need some transformation before further analysis. The following section will show the process of binning continuous variables into specific groups.

Age

The original ‘Customer_Age’ column is made of continuous age variables ranging from 20 to 80. In this section, the variable will be categorized into a total of 6 groups.

min(df1$Customer_Age) # minimum value of 'Customer_Age' column
## [1] 26
max(df1$Customer_Age) # maximum value
## [1] 70
age_labels <- c("20-29","30-39","40-49","50-59","60-69","70+") # define groups
df2 <- df1 %>% # break 'Customer_Age' column data into defined groups and store it as a new column
  mutate(Age = cut(Customer_Age,
                   breaks = c(20,30,40,50,60,70,80), 
                   right = F, 
                   labels = age_labels)) 

#check the age binning by looking at 10 randomly chosen data subset
age_check <- df2 %>%
  select(c(Customer_Age,Age)) %>%
  sample_n(10)

age_check # correctly done!
##    Customer_Age   Age
## 1            57 50-59
## 2            51 50-59
## 3            40 40-49
## 4            52 50-59
## 5            30 30-39
## 6            48 40-49
## 7            53 50-59
## 8            49 40-49
## 9            46 40-49
## 10           30 30-39
#remove continuous age value column
df2 <- df2 %>%
  relocate(Age, .after = Customer_Age) %>%
  select(-Customer_Age)

df2[1:5,] # check
##   CLIENTNUM    Attrition_Flag   Age Gender Dependent_count Education_Level
## 1 713061558 Existing Customer 40-49      M               2        Graduate
## 2 810347208 Existing Customer 50-59      M               4         Unknown
## 3 818906208 Existing Customer 30-39      M               0     High School
## 4 719661558 Existing Customer 40-49      M               2        Graduate
## 5 708790833 Existing Customer 40-49      M               5      Uneducated
##   Marital_Status Income_Category Card_Category Months_on_book
## 1        Married     $40K - $60K          Blue             36
## 2        Married         $120K +          Gold             46
## 3        Unknown     $60K - $80K        Silver             27
## 4         Single    $80K - $120K          Blue             36
## 5        Unknown         $120K +          Blue             31
##   Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon
## 1                        3                      1                     2
## 2                        6                      1                     3
## 3                        2                      2                     2
## 4                        6                      3                     3
## 5                        5                      3                     2
##   Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1
## 1         4010                1247            2763                1.376
## 2        34516                2264           32252                1.975
## 3        29081                1396           27685                2.204
## 4        11656                1677            9979                1.524
## 5         6748                1467            5281                0.831
##   Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 1            1088             24               0.846                 0.311
## 2            1330             31               0.722                 0.066
## 3            1538             36               0.714                 0.048
## 4            1441             32               0.882                 0.144
## 5            1201             42               0.680                 0.217

Number of Dependents

This section will categorize the ‘Dependent_count’ column which consists of number of dependents data into 4 different groups.

min(df2$Dependent_count) # min value of 'Dependent_count' column
## [1] 0
max(df2$Dependent_count) # max value
## [1] 5
dependent_labels <- c("0","1-2","3-4","5+") # define binning groups
df2 <- df2 %>% # break the column into defined groups of data and store it as a new column
  mutate(Dependent_Count = cut(Dependent_count,
                               breaks = c(0,1,3,5,10), 
                               right = F, 
                               labels = dependent_labels)) 

#check the dependent count binning by looking at randomly chosen data subset
dependent_check <- df2 %>%
  select(c(Dependent_count,Dependent_Count)) %>%
  sample_n(10)

dependent_check
##    Dependent_count Dependent_Count
## 1                3             3-4
## 2                2             1-2
## 3                3             3-4
## 4                4             3-4
## 5                2             1-2
## 6                2             1-2
## 7                3             3-4
## 8                2             1-2
## 9                3             3-4
## 10               0               0
df2 <- df2 %>% #remove continuous dependent count value column
  relocate(Dependent_Count, .after = Dependent_count) %>%
  select(-Dependent_count)
df2[1:5,] # check
##   CLIENTNUM    Attrition_Flag   Age Gender Dependent_Count Education_Level
## 1 713061558 Existing Customer 40-49      M             1-2        Graduate
## 2 810347208 Existing Customer 50-59      M             3-4         Unknown
## 3 818906208 Existing Customer 30-39      M               0     High School
## 4 719661558 Existing Customer 40-49      M             1-2        Graduate
## 5 708790833 Existing Customer 40-49      M              5+      Uneducated
##   Marital_Status Income_Category Card_Category Months_on_book
## 1        Married     $40K - $60K          Blue             36
## 2        Married         $120K +          Gold             46
## 3        Unknown     $60K - $80K        Silver             27
## 4         Single    $80K - $120K          Blue             36
## 5        Unknown         $120K +          Blue             31
##   Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon
## 1                        3                      1                     2
## 2                        6                      1                     3
## 3                        2                      2                     2
## 4                        6                      3                     3
## 5                        5                      3                     2
##   Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1
## 1         4010                1247            2763                1.376
## 2        34516                2264           32252                1.975
## 3        29081                1396           27685                2.204
## 4        11656                1677            9979                1.524
## 5         6748                1467            5281                0.831
##   Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 1            1088             24               0.846                 0.311
## 2            1330             31               0.722                 0.066
## 3            1538             36               0.714                 0.048
## 4            1441             32               0.882                 0.144
## 5            1201             42               0.680                 0.217

Create Tables for Analysis

The following section will examine and compare quantitative values, and look for any meaningful insights from the dataset.

# Check for NA values in column
#check for the number of NA, 0, missing values for each column
sapply(df, function(x) sum(is.na(x) | x == 0 | x == ""))
##                CLIENTNUM           Attrition_Flag             Customer_Age 
##                        0                        0                        0 
##                   Gender          Dependent_count          Education_Level 
##                        0                      904                        0 
##           Marital_Status          Income_Category            Card_Category 
##                        0                        0                        0 
##           Months_on_book Total_Relationship_Count   Months_Inactive_12_mon 
##                        0                        0                       29 
##    Contacts_Count_12_mon             Credit_Limit      Total_Revolving_Bal 
##                      399                        0                     2470 
##          Avg_Open_To_Buy     Total_Amt_Chng_Q4_Q1          Total_Trans_Amt 
##                        0                        5                        0 
##           Total_Trans_Ct      Total_Ct_Chng_Q4_Q1    Avg_Utilization_Ratio 
##                        0                        7                     2470
sapply(df2, function(x) sum(is.na(x) | x == 0 | x == ""))
##                CLIENTNUM           Attrition_Flag                      Age 
##                        0                        0                        0 
##                   Gender          Dependent_Count          Education_Level 
##                        0                      863                        0 
##           Marital_Status          Income_Category            Card_Category 
##                        0                        0                        0 
##           Months_on_book Total_Relationship_Count   Months_Inactive_12_mon 
##                        0                        0                       29 
##    Contacts_Count_12_mon             Credit_Limit      Total_Revolving_Bal 
##                      375                        0                     2383 
##          Avg_Open_To_Buy     Total_Amt_Chng_Q4_Q1          Total_Trans_Amt 
##                        0                        0                        0 
##           Total_Trans_Ct      Total_Ct_Chng_Q4_Q1    Avg_Utilization_Ratio 
##                        0                        0                     2383

Gender and Age

In this section, the dataset has been grouped by gender and age. I then calculated the mean utilization ratio for each group to compare card usage across diferent gender and age groups.

#group by biography info
#group by gender and age for utilization ratio comparison
summary <- df2 %>%
  group_by(Gender, Age) %>%
  summarise(mean_util_ratio = mean(Avg_Utilization_Ratio))
## `summarise()` regrouping output by 'Gender' (override with `.groups` argument)
kable(summary)
Gender Age mean_util_ratio
F 20-29 0.3618587
F 30-39 0.3424089
F 40-49 0.3317936
F 50-59 0.3536289
F 60-69 0.3405152
M 20-29 0.2891939
M 30-39 0.2306788
M 40-49 0.1736448
M 50-59 0.1949627
M 60-69 0.2813291
M 70+ 0.4600000

Marital status

dlkfnalsdnflsnflansdlfnaflansflanlafnlanlnflnlna

#group by family info
#group by marital status for multiple comparisons
summary2 <- df2 %>%
  group_by(Marital_Status) %>%
  summarise(mean_inactive_months_count = mean(Months_Inactive_12_mon),
            mean_util_ratio = mean(Avg_Utilization_Ratio),
            mean_trans_count = mean(Total_Trans_Ct))
## `summarise()` ungrouping output (override with `.groups` argument)
kable(summary2)
Marital_Status mean_inactive_months_count mean_util_ratio mean_trans_count
Divorced 2.356354 0.2576823 67.92265
Married 2.331089 0.2882040 63.03816
Single 2.354031 0.2647613 68.52257
Unknown 2.321775 0.2576352 68.18585
#married who has the highest utilization ratio are least likely to be inactive for 12 months

Dependents

dlkfnalsdnflsnflansdlfnaflansflanlafnlanlnflnlna

#group by number of dependents
summary3 <- df2 %>%
  group_by(Dependent_Count) %>%
  summarise(mean_util_ratio = mean(Avg_Utilization_Ratio), 
            mean_trans_count = mean(Total_Trans_Ct),
            mean_revolving_balance = mean(Total_Revolving_Bal))
## `summarise()` ungrouping output (override with `.groups` argument)
kable(summary3)
Dependent_Count mean_util_ratio mean_trans_count mean_revolving_balance
0 0.2914751 62.78795 1133.487
1-2 0.2809137 65.95606 1169.151
3-4 0.2645555 66.14624 1151.157
5+ 0.2699658 70.39853 1189.548

Card Class

dlkfnalsdnflsnflansdlfnaflansflanlafnlanlnflnlna

#group by card info
#calculated revolving balance percentage for better comparison
summary4 <- df2 %>%
  group_by(Card_Category) %>%
  summarise(mean_util_ratio = mean(Avg_Utilization_Ratio),
            mean_credit_limit = mean(Credit_Limit), 
            mean_revolving_balance = mean(Total_Revolving_Bal),
            revolving_balance_perc = (mean_revolving_balance/mean_credit_limit)*100) %>%
    arrange(match(Card_Category, c("Blue","Silver","Platinum","Gold")))
## `summarise()` ungrouping output (override with `.groups` argument)
kable(summary4)
Card_Category mean_util_ratio mean_credit_limit mean_revolving_balance revolving_balance_perc
Blue 0.2904983 7352.228 1153.649 15.691143
Silver 0.0571434 25272.415 1207.188 4.776702
Platinum 0.0436500 30283.450 1267.950 4.186940
Gold 0.0576786 28534.348 1358.750 4.761805

Education Level

dlkfnalsdnflsnflansdlfnaflansflanlafnlanlnflnlna

#group by education level for comparison of credit limit and revolving balance
summary5 <- df2 %>%
  group_by(Education_Level) %>%
  summarise(mean_credit_limit = mean(Credit_Limit), mean_revolving_balance = mean(Total_Revolving_Bal)) %>%
  arrange(match(Education_Level, c("Doctorate","Post-Graduate","Graduate","College",
                                   "High School","Uneducated","Unknown")))
## `summarise()` ungrouping output (override with `.groups` argument)
kable(summary5)
Education_Level mean_credit_limit mean_revolving_balance
Doctorate 8392.653 1084.402
Post-Graduate 8945.505 1192.671
Graduate 8592.193 1154.717
College 8640.953 1132.302
High School 8563.104 1192.357
Uneducated 8830.205 1154.027
Unknown 8568.678 1157.905